#!/usr/bin/env perl

# 使用nanopolish工具分析甲基化
# nanopolish工具本身安装就有一定曲折。一切周折过后，nanopolish自带的python脚本获取单位点甲基化信息，必须把整个Reads碱基甲基化信息全部读完才可以输出，这一步也非常限速。
# 本脚本专门封装 nanopolish call-methylation，将针对全基因组的分析拆分为单个区间，之后分别提取甲基化信息，最后汇总全部结果

use strict;
use warnings;
use Getopt::Long;
use Cwd;
use Cwd 'abs_path';
use FindBin qw($Bin);
use File::Basename;
use FileHandle;
use Parallel::ForkManager;

# 程序路径与关键环境变量
our $workdir=getcwd(); # 工作路径，为输出文件、日志文件存储的文件夹，默认当前路径
$ENV{'HDF5_PLUGIN_PATH'} = '/usr/local/hdf5/lib/plugin'; # 这是一个坑，我在简书中会介绍
our $nanopolish = "$Bin/nanopolish";
our $MethFreq = "$Bin/scripts/calculate_methylation_frequency.py";
our $bedtools = &GetTools('bedtools');
our $samtools = &GetTools('samtools');
our $tabix = &GetTools('tabix');

# 输入参数
our $reads = (); # 务必要跟fast5用 nanopolish index 做好关联！！！
our $bam = ();
our $ref = ();
our $out = 'results'; # 输出结果
our $target = (); # 分析的靶向区间
our $CPU = `grep processor /proc/cpuinfo |wc -l`; chomp($CPU); # 总CPU数
our $Njob = 12; # 同时并行多少个小区间
our $bin = 200000; # 并行的小区间有多大
our $SplitGroups = (); # calculate_methylation_frequency.py 的 --split-groups 参数。
our $tmpDir = ();
our $keepTmp;
our $options = ();
our $help;

# bedtools makewindows -g ./ncbi_Phaeodactylum_tricornutum/ptr.fa.fai -w 200000

GetOptions(
 'i|reads=s' => \$reads,
 'b|bam=s' => \$bam,
 'r|ref=s' => \$ref,
 'o|out=s' => \$out,
 't|target=s' => \$target,
 'w|window=i' => \$bin,
 'cpu=i' => \$CPU,
 'j|job=i' => \$Njob,
 's|split-groups!' => \$SplitGroups,
 'options=s' => \$options,
 'tmp=s' => \$tmpDir,
 'k|keep!' => \$keepTmp,
 'h|help' => \$help
);


my $Mannual=<<H;
perl $0
  封装 nanopolish call-methylation ，实现并行化
    -r|ref              参考基因组
    -i|reads            输入ONT的长reads，务必与其对应的fast5用 nanopolish index 做好关联
    -b|bam              输入的bam
    -o|out              输出的甲基化结果文件前缀（默认$out)
    -t|target           待检测的基因组靶向区间（默认整个基因组）

    -cpu                使用的总线程数（默认节点上全部$CPU个线程）
    -j|jobs             同时并行多少个任务（默认$Njob)
    -w|window           每一个任务，基因组区间大小（默认$bin）
    -tmp                临时文件路径（默认输出文件前缀.tmp）
    -s|split-group      -s表示启用calculate_methylation_frequency.py 的 --split-groups 参数。
    -k|keep             -k表示程序运行结束，不删除临时文件
    -options            nanopolish call-methylation的其他运行参数

    -h|help             Show this message
  关于本脚本配套的一系列操作，可参考下面：
    简书：https://www.jianshu.com/p/60d620a166fa
    gitee：https://gitee.com/wangshun1121/nanopolish-methyl
H

unless($reads and $bam and $ref){
    print $Mannual;
    exit();
}else{
    $reads = abs_path($reads);
    &CheckReads;
    
    $bam = abs_path($bam);
    unless(-e "$bam.bai"){
        system("$samtools index -@ 8 $bam");
    }
    
    $ref = abs_path($ref);
    unless(-e "$ref.fai"){
        system("$samtools faidx $ref");
    }
}

our $threads = int($CPU/$Njob);

if($help){
    print $Mannual;
    exit();
}elsif($options){
    if($options eq '-h'){
        system("$nanopolish call-methylation -h");
        exit();
    }
}

unless($target){$target = "$ref.fai";}

unless($tmpDir){$tmpDir = "$workdir/$out\.tmp";}
unless(-e "$tmpDir"){system("mkdir -p $tmpDir");}

# 1、拆分目标区间
my @Targets=();
open O,"|bgzip -c >$tmpDir/targets.bed.gz";
if($target eq "$ref.fai"){
    open I,"$bedtools makewindows -g $target -w $bin|";
}else{
    open I,"$bedtools sort -g $target -i $target|$bedtools merge -i -|$bedtools makewindows -b $target -w $bin|";
}
while(<I>){
    print O $_;
    chomp;
    my @a=split/\t/;
    $_ = "$a[0]\:$a[1]\-$a[2]";
    push(@Targets,$_);
}
close I;
close O;

# 2、甲基化并行分析
my $pm=new Parallel::ForkManager($Njob);
foreach my $Target(@Targets){
    my $pid=$pm->start and next;
    &CallMeth($Target);
    $pm->finish;
}
$pm->wait_all_children;

# 3、结合汇总
open O1,"|bgzip -c >$out.meth_call.tsv.gz";
open O2,"|bgzip -c >$out.meth_freq.tsv.gz";
for(my $i=0;$i<scalar(@Targets);$i++){
    open I,"gunzip -c $tmpDir/$Targets[$i].meth_call.tsv.gz|";
    if($i == 0){
        $_=<I>;
        print O1 $_;
    }else{<I>;}
    while(<I>){
        print O1 $_;
    }
    close I;
    
    my %Plus = &MethFreq2Hash("$tmpDir/$Targets[$i].meth_freq.plus.tsv");
    my %Minus = &MethFreq2Hash("$tmpDir/$Targets[$i].meth_freq.minus.tsv");
    
    open I,"$tmpDir/$Targets[$i].meth_freq.tsv";
    if($i == 0){
        $_=<I>;
        chomp;
        print O2 $_."\tcalled_sites.plus\tcalled_sites_methylated.plus\tmethylated_frequency.plus\tcalled_sites.minus\tcalled_sites_methylated.minus\tmethylated_frequency.minus\n";
    }else{<I>;}
    while(<I>){
        chomp;
        print O2 $_;
        my @a=split/\t/;
        my $Key = "$a[0]\t$a[1]\t$a[2]\t$a[3]\t$a[-1]";
        if($Plus{$Key}){
            print O2 "\t$Plus{$Key}";
        }else{
            print O2 "\t0\t0\tNA";
        }
        if($Minus{$Key}){
            print O2 "\t$Minus{$Key}";
        }else{
            print O2 "\t0\t0\tNA";
        }
        print O2 "\n";
    }
    close I;
}
close O2;
close O1;
system("$tabix -p bed -S 1 $out.meth_call.tsv.gz");
system("$tabix -p bed -S 1 $out.meth_freq.tsv.gz");
unless($keepTmp){system("rm -rf $tmpDir");}


# 子函数
sub GetTools{
    # 从环境变量中获取特定工具的路径
    my $tool=shift;
    my $toolpath = `which $tool`;
    chomp($toolpath);
    unless($toolpath){
        print "Can not find $tool, please check! \n";
        exit;
    }
    return($toolpath);
}

sub CheckReads{
    my $OK = 1;
    unless(-e "$reads.index"){
        print "No $reads.index, input reads should be indexed with its fast5 files using nanopolish index! \n";
        exit;
    }
    unless(-e "$reads.index.fai"){
        print "No $reads.index.fai, input reads should be indexed with its fast5 files using nanopolish index! \n";
        exit;
    }
    unless(-e "$reads.index.gzi"){
        print "No $reads.index.gzi, input reads should be indexed with its fast5 files using nanopolish index! \n";
        exit;
    }
    unless(-e "$reads.index.readdb"){
        print "No $reads.index.readdb, input reads should be indexed with its fast5 files using nanopolish index! \n";
        exit;
    }
}

sub CallMeth{
    # 分析甲基化
    my $Target=shift;
    my $CMD = "$nanopolish call-methylation -t $threads -r $reads -b $bam -g $ref -w $Target";
    if($options){$CMD .= " $options";}
    my $AWK = 'awk -F \'\t\' \'{print $1"\t"$3"\t"$4"\t"$5"\t"$11"\t"$2"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}\'';
    
    open O,"|bgzip -c > $tmpDir/$Target.meth_call.tsv.gz";
    print O "chromosome\tstart\tend\tread_name\tsequence\tstrand\tlog_lik_ratio\tlog_lik_methylated\tlog_lik_unmethylated\tnum_calling_strands\tnum_motifs\n";
    open I,"$CMD|$AWK|grep -v start|$bedtools sort -i -|";
    while(<I>){
        print O $_;
    }
    close I;
    close O;
    
    $CMD = $MethFreq;
    if($SplitGroups){$CMD .= " --split-groups";}
    # system("$CMD $tmpDir/$Target.meth_call.tsv.gz > $tmpDir/$Target.meth_freq.tsv");
    # 这里考虑按Strand区分甲基化结果
    open I,"gunzip -c $tmpDir/$Target.meth_call.tsv.gz|";
    open O1,"|$CMD /dev/stdin >  $tmpDir/$Target.meth_freq.tsv";
    open O2,"|$CMD /dev/stdin >  $tmpDir/$Target.meth_freq.plus.tsv";
    open O3,"|$CMD /dev/stdin >  $tmpDir/$Target.meth_freq.minus.tsv";
    $_=<I>;
    print O1 $_;
    print O2 $_;
    print O3 $_;
    while(<I>){
        print O1 $_;
        my $Strand = (split/\t/)[5];
        if($Strand eq '+'){print O2 $_;}
        if($Strand eq '-'){print O3 $_;}
    }
    close O3;
    close O2;
    close O1;
}

sub MethFreq2Hash{
    # 读取甲基化结果，并存为Hash
    my $File = shift;
    my %Hash = ();
    open J,$File;
    <J>;
    while(<J>){
        chomp;
        my @a = split/\t/;
        my $Key = "$a[0]\t$a[1]\t$a[2]\t$a[3]\t$a[-1]";
        $Hash{$Key} = "$a[4]\t$a[5]\t$a[6]";
    }
    close J;
    return(%Hash);
}
